Setup
Necessary Libraries
library(MicrobeR)
library(dada2)
library(vegan)
library(ape)
library(philr)
library(lmerTest)
library(tidyverse)
library(readxl)
library(phyloseq)
library(ggtree)
library(qiime2R)
library(ALDEx2)
library(gghighlight)
library(ggpubr)
library(cowplot)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.0.0 ggpubr_0.4.0 gghighlight_0.3.0 ALDEx2_1.18.0
## [5] qiime2R_0.99.34 ggtree_2.0.4 phyloseq_1.30.0 readxl_1.3.1
## [9] forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.4
## [13] readr_1.3.1 tidyr_1.0.2 tibble_3.0.1 ggplot2_3.3.0
## [17] tidyverse_1.3.0 lmerTest_3.1-2 lme4_1.1-23 Matrix_1.2-18
## [21] philr_1.12.0 ape_5.3 vegan_2.5-6 lattice_0.20-38
## [25] permute_0.9-5 dada2_1.14.1 Rcpp_1.0.4 MicrobeR_0.3.2
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.6 Hmisc_4.4-0
## [3] fastmatch_1.1-0 plyr_1.8.6
## [5] igraph_1.2.5 lazyeval_0.2.2
## [7] splines_3.6.3 BiocParallel_1.20.1
## [9] GenomeInfoDb_1.22.1 rtk_0.2.5.8
## [11] digest_0.6.25 foreach_1.5.0
## [13] htmltools_0.5.0 fansi_0.4.1
## [15] checkmate_2.0.0 magrittr_1.5
## [17] memoise_1.1.0 cluster_2.1.0
## [19] DECIPHER_2.14.0 openxlsx_4.1.5
## [21] Biostrings_2.54.0 modelr_0.1.6
## [23] RcppParallel_5.0.0 matrixStats_0.56.0
## [25] jpeg_0.1-8.1 colorspace_1.4-1
## [27] blob_1.2.1 rvest_0.3.5
## [29] haven_2.2.0 xfun_0.13
## [31] crayon_1.3.4 RCurl_1.98-1.2
## [33] jsonlite_1.6.1 survival_3.1-8
## [35] phangorn_2.5.5 iterators_1.0.12
## [37] glue_1.4.0 gtable_0.3.0
## [39] zlibbioc_1.32.0 XVector_0.26.0
## [41] DelayedArray_0.12.3 car_3.0-8
## [43] Rhdf5lib_1.8.0 BiocGenerics_0.32.0
## [45] abind_1.4-5 scales_1.1.0
## [47] DBI_1.1.0 rstatix_0.6.0
## [49] htmlTable_1.13.3 viridisLite_0.3.0
## [51] tidytree_0.3.3 foreign_0.8-75
## [53] bit_1.1-15.2 Formula_1.2-3
## [55] stats4_3.6.3 DT_0.13
## [57] truncnorm_1.0-8 htmlwidgets_1.5.1
## [59] httr_1.4.1 RColorBrewer_1.1-2
## [61] acepack_1.4.1 ellipsis_0.3.0
## [63] pkgconfig_2.0.3 NADA_1.6-1.1
## [65] nnet_7.3-12 dbplyr_1.4.3
## [67] tidyselect_1.0.0 rlang_0.4.5
## [69] reshape2_1.4.4 munsell_0.5.0
## [71] cellranger_1.1.0 tools_3.6.3
## [73] cli_2.0.2 generics_0.0.2
## [75] RSQLite_2.2.0 ade4_1.7-15
## [77] broom_0.5.6 evaluate_0.14
## [79] biomformat_1.14.0 yaml_2.2.1
## [81] knitr_1.28 bit64_0.9-7
## [83] fs_1.4.1 zip_2.0.4
## [85] nlme_3.1-144 xml2_1.3.2
## [87] rstudioapi_0.11 compiler_3.6.3
## [89] curl_4.3 plotly_4.9.2.1
## [91] png_0.1-7 ggsignif_0.6.0
## [93] zCompositions_1.3.4 reprex_0.3.0
## [95] treeio_1.10.0 statmod_1.4.34
## [97] stringi_1.4.6 nloptr_1.2.2.1
## [99] multtest_2.42.0 vctrs_0.2.4
## [101] pillar_1.4.3 lifecycle_0.2.0
## [103] BiocManager_1.30.10 data.table_1.12.8
## [105] bitops_1.0-6 GenomicRanges_1.38.0
## [107] R6_2.4.1 latticeExtra_0.6-29
## [109] hwriter_1.3.2 ShortRead_1.44.3
## [111] rio_0.5.16 gridExtra_2.3
## [113] IRanges_2.20.2 codetools_0.2-16
## [115] boot_1.3-24 MASS_7.3-51.5
## [117] assertthat_0.2.1 picante_1.8.1
## [119] rhdf5_2.30.1 SummarizedExperiment_1.16.1
## [121] withr_2.2.0 GenomicAlignments_1.22.1
## [123] Rsamtools_2.2.3 S4Vectors_0.24.4
## [125] GenomeInfoDbData_1.2.2 mgcv_1.8-31
## [127] parallel_3.6.3 hms_0.5.3
## [129] rpart_4.1-15 quadprog_1.5-8
## [131] grid_3.6.3 minqa_1.2.4
## [133] rmarkdown_2.1 rvcheck_0.1.8
## [135] carData_3.0-4 base64enc_0.1-3
## [137] numDeriv_2016.8-1.1 Biobase_2.46.0
## [139] lubridate_1.7.8
Data Import
SVtab<-read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_table.qza")$data %>% as.data.frame()
SVseq<-read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_sequences.qza")$data %>% as.data.frame() %>% rename(c("SV"="x"))
taxonomy<-read.delim("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_d2taxonomy.txt", header=T)
lookup<-(SVseq %>% rownames_to_column("ASV")) %>%
left_join(taxonomy, by="ASV") %>%
column_to_rownames("ASV")
## Warning: Column `ASV` joining character vector and factor, coercing into
## character vector
SVtree_denovo <- read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_denovotree.qza")
#Here I am parsing out the different experiments.
IDEOmeta <- read.csv("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_mouse_metadata.csv", header = T)
IDEOmeta1and2 <- IDEOmeta %>% filter((Experiment == "IDEO1"|Experiment=="IDEO2") & !is.na(SequencingID))
rownames(IDEOmeta1and2) <- IDEOmeta1and2$SequencingID
IDEO12SVtab <- SVtab[,rownames(IDEOmeta1and2)]
Theme
# setting the color palette choice and statistical comparisons for the ggpubr package.
Whitecolor="#E69F00"
Chinesecolor="#0072B2"
statgroups <- list(c("EA", "W"))
# theme for pcoas
theme_pcoa<- function () {
theme_classic(base_size=10, base_family="Helvetica") +
theme(axis.text = element_text(size=8, color = "black"),
axis.title = element_text(size=10, color="black"),
legend.text = element_text(size=8, color = "black"),
legend.title = element_text(size=10, color = "black"),
plot.title = element_text(size=10, color="black")) +
theme(panel.border = element_rect(color="black", size=1, fill=NA))
}
# theme for boxplots
theme_boxplot<- function () {
theme_bw(base_size=10, base_family="Helvetica") +
theme(axis.text.x = element_text(size=10, color = "black"),
axis.text.y = element_text(size=8, color="black"),
axis.title.x= element_blank(),
axis.title.y = element_text(size=10, color="black"),
legend.position = "none",
panel.grid = element_blank(),
strip.text = element_text(size=10, color = "black"))
}
Quality Filter
#Looking at at the number of sequences per sample
histogram(colSums(IDEO12SVtab))

IDEO12SVtab<-Confidence.Filter(IDEO12SVtab, MINSAMPS = 2, MINREADS=10, VERBOSE=TRUE)
lookup<-lookup[rownames(IDEO12SVtab),]
IDEO12tree <- drop.tip(SVtree_denovo$data,SVtree_denovo$data$tip.label[!SVtree_denovo$data$tip.label %in% rownames(IDEO12SVtab)])
Normalized Tables
PHILR<-philr(
t(IDEO12SVtab+1),
IDEO12tree,
part.weights='enorm.x.gm.counts',
ilr.weights='blw.sqrt'
)
## Building Sequential Binary Partition from Tree...
## Building Contrast Matrix...
## Transforming the Data...
## Calculating ILR Weights...
## Warning in calculate.blw(tree, method = "sum.children"): Note: a total of 22
## tip edges with zero length have been replaced with a small pseudocount of the
## minimum non-zero edge length ( 5e-09 ).
PCoA including donors
# filter mouse samples
f.meta<-IDEOmeta1and2
rownames(f.meta)<-f.meta$SequencingID
f.SVtab<-IDEO12SVtab[,rownames(f.meta)]
f.PHILR<-PHILR[rownames(f.meta),]
#Calculate PCo axis values
euclid<-ape::pcoa(dist(f.PHILR, method="euclidean"))
var.PCo1 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[2], digits=2, nsmall=1)
var.PCo3 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[3], digits=2, nsmall=1)
#Scree plot
euclid$values %>%
as.data.frame() %>%
rownames_to_column("PC") %>%
mutate(PC=as.numeric(PC)) %>%
select(PC, Relative_eig) %>%
mutate(PercentVariation=Relative_eig*100) %>%
ggplot(aes(x=PC, y=PercentVariation)) +
geom_point()

#Plot pcoa
euclid$vectors %>%
as.data.frame() %>%
rownames_to_column("SequencingID") %>%
left_join(f.meta) %>%
ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity, shape=Organism)) +
geom_point(size=2) +
scale_shape_manual(values=c(23,21)) +
theme_pcoa() +
xlab(paste0("PCo1 [",var.PCo1,"%]")) +
ylab(paste0("PCo2 [",var.PCo2,"%]")) +
scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor))
## Joining, by = "SequencingID"
## Warning: Column `SequencingID` joining character vector and factor, coercing
## into character vector

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/gnoto/LFPP12_pcoa_wdonors.pdf", height=2.5, width=3.5, useDingbats=F)
PCoA
# filter mouse samples
f.meta<-IDEOmeta1and2 %>% filter(Organism=="Mouse")
rownames(f.meta)<-f.meta$SequencingID
f.SVtab<-IDEO12SVtab[,rownames(f.meta)]
f.PHILR<-PHILR[rownames(f.meta),]
#Adonis
adonis<-vegan::adonis(dist(f.PHILR, method="euclidean") ~ Ethnicity, data=f.meta)
adonis$aov.tab
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Ethnicity 1 2104.5 2104.46 13.707 0.38388 0.001 ***
## Residuals 22 3377.6 153.53 0.61612
## Total 23 5482.1 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Calculate PCo axis values
euclid<-ape::pcoa(dist(f.PHILR, method="euclidean"))
var.PCo1 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[2], digits=2, nsmall=1)
var.PCo3 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[3], digits=2, nsmall=1)
#Scree plot
euclid$values %>%
as.data.frame() %>%
rownames_to_column("PC") %>%
mutate(PC=as.numeric(PC)) %>%
select(PC, Relative_eig) %>%
mutate(PercentVariation=Relative_eig*100) %>%
ggplot(aes(x=PC, y=PercentVariation)) +
geom_point()

#Plot pcoa
euclid$vectors %>%
as.data.frame() %>%
rownames_to_column("SequencingID") %>%
left_join(f.meta) %>%
ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity, group=SampleID)) +
geom_line(linetype="dashed", color="grey50") +
geom_point(size=2, shape=21) +
theme_pcoa() +
xlab(paste0("PCo1 [",var.PCo1,"%]")) +
ylab(paste0("PCo2 [",var.PCo2,"%]")) +
scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
ggtitle(paste0("p=",adonis$aov.tab$`Pr(>F)`,", r2=",signif(adonis$aov.tab$R2[1]),digits=3))
## Joining, by = "SequencingID"
## Warning: Column `SequencingID` joining character vector and factor, coercing
## into character vector

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/gnoto/LFPP12_pcoa.pdf", height=2.5, width=3.2, useDingbats=F)
Alpha Diversity
alphadiv <- data.frame(
Shannon = vegan::diversity(Subsample.Table(IDEO12SVtab), index = "shannon", MARGIN = 2),
FaithsPD = picante::pd(t(Subsample.Table(IDEO12SVtab)), IDEO12tree, include.root = F)$PD,
Richness = specnumber(Subsample.Table(IDEO12SVtab), MARGIN = 2)) %>% #Calc richness on subsampled table
rownames_to_column("SequencingID") %>%
left_join(IDEOmeta1and2) %>%
select (SampleID, Organism, Ethnicity, Shannon, FaithsPD, Richness) %>%
pivot_longer(cols=Shannon:Richness, names_to="alpha_metric")
## Subsampling feature table to 8383 , currently has 419 taxa.
## ...sampled to 8383 reads with 417 taxa
## Subsampling feature table to 8383 , currently has 419 taxa.
## ...sampled to 8383 reads with 417 taxa
## Subsampling feature table to 8383 , currently has 419 taxa.
## ...sampled to 8383 reads with 417 taxa
## Joining, by = "SequencingID"
## Warning: Column `SequencingID` joining character vector and factor, coercing
## into character vector
# plot alpha div metrics
alphadiv %>%
filter(Organism=="Mouse") %>%
ggplot(aes(x=Ethnicity, y=value, fill=Ethnicity)) +
geom_boxplot(outlier.shape=NA) +
geom_jitter(shape=21, size=2, height=0, width = 0.05) +
facet_wrap(~alpha_metric, scales="free", nrow=1) +
theme_boxplot() +
theme(panel.grid = element_blank(), axis.title.x = element_blank(), axis.text.x=element_text(size=10), axis.title = element_text(size=10), axis.text.y=element_text(size=8)) +
scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
ylab("Alpha diversity") +
stat_compare_means(comparison=statgroups, method="wilcox")
## Warning in wilcox.test.default(c(101, 84, 105, 106, 102, 113, 71, 153, 152, :
## cannot compute exact p-value with ties

# stats
alphadiv %>%
filter(Organism=="Mouse") %>%
group_by(alpha_metric) %>%
do(
broom::glance(wilcox.test(value~Ethnicity, data=., paired=F))
) %>%
ungroup() -> results.alpha
## Warning in wilcox.test.default(x = c(101, 84, 105, 106, 102, 113, 71, 153, :
## cannot compute exact p-value with ties
Nice.Table(results.alpha)
# plot richness only
alphadiv %>%
filter(Organism=="Mouse") %>%
filter(alpha_metric=="Richness") %>%
ggplot(aes(x=Ethnicity, y=value, fill=Ethnicity)) +
geom_boxplot(outlier.shape=NA) +
geom_jitter(shape=21, size=2, height=0, width = 0.1) +
theme_boxplot() +
scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
ylab("Richness") +
stat_compare_means(comparison=statgroups, method="wilcox")
## Warning in wilcox.test.default(c(101, 84, 105, 106, 102, 113, 71, 153, 152, :
## cannot compute exact p-value with ties

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/gnoto/LFPP12_richness.pdf", height=2.5, width=2.3, useDingbats=F)
Volcano plot
# aldex (ASVs)
totest<-Fraction.Filter(f.SVtab,0.0005)
## [1] "Filtering table at a min fraction of 5e-04 of feature table..."
## [1] "...There are 862126 reads and 419 features"
## [1] "...After filtering there are 836976 reads and 181 OTUs"
ttests.asvs <- aldex(totest, f.meta$Ethnicity, mc.samples = 128, denom = "all", test = "t", effect = TRUE, include.sample.summary = TRUE)
ttests.asvs %>% filter(we.eBH<0.2) -> sigres
print(paste("There are", nrow(sigres), "differentially abundant ASVs in the gnoto mice (FDR<0.2)."))
## [1] "There are 138 differentially abundant ASVs in the gnoto mice (FDR<0.2)."
# aldex (genus level)
genera<-Summarize.Taxa(f.SVtab, lookup)$Genus
f.genera<-Fraction.Filter(genera,0.0005)
## [1] "Filtering table at a min fraction of 5e-04 of feature table..."
## [1] "...There are 862126 reads and 140 features"
## [1] "...After filtering there are 854127 reads and 84 OTUs"
ttests.genera <- aldex(f.genera, f.meta$Ethnicity, mc.samples = 128, denom = "all", test = "t", effect = TRUE, include.sample.summary = TRUE)
results<-
ttests.genera %>%
rownames_to_column("Feature") %>%
select(Feature,
logFC_Between=diff.btw,
logFC_Within=diff.win,
Abundance_EA=rab.win.EA,
Abundance_W=rab.win.W,
Pvalue=we.ep,
FDR=we.eBH,
EffectSize=effect) %>%
mutate(logFC_EA_vs_W=-(logFC_Between)) %>%
separate(Feature,sep=";",into=c("K","P","C","O","F","Genus"),remove=F)
# significant results
sigres <- results %>% filter(FDR<0.1 & abs(logFC_Between)>1)
Nice.Table(sigres)
# volcano plot
sigres_subset <-
sigres %>% filter(Genus %in% c("Bacteroides","Butyricicoccus","Blautia","Parabacteroides","[Ruminococcus] torques group", "[Clostridium] innocuum group"))
ggplot(results, aes(x = logFC_EA_vs_W, y = -log10(FDR), label=Genus)) +
geom_point(size=1) +
gghighlight(FDR < 0.1 & abs(logFC_Between)>1) +
geom_text(data=sigres_subset,aes(label=Genus), size=1, hjust=1, vjust=1.5) +
theme_bw() +
xlab("Log2 fold difference (EA/W)")

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/gnoto/LFPP12_volplot_fdr0.1.pdf", height=2.5, width=2.5, useDingbats=F)
Plot Bacteroides
genera %>%
Make.CLR() %>%
as.data.frame() %>%
rownames_to_column("Genus") %>%
filter(grepl("Bacteroides",Genus)) %>%
pivot_longer(-Genus, names_to = "ID", values_to = "CLR") %>%
left_join(f.meta %>% rownames_to_column("ID")) %>%
ggplot(aes(x=Ethnicity, y=CLR, fill=Ethnicity)) +
geom_boxplot(outlier.shape=NA) +
geom_jitter(shape=21, size=2, height=0, width=0.1) +
theme_boxplot() +
scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
ylab("Bacteroides abundance (CLR)") +
stat_compare_means(comparison=statgroups, method="wilcox")
## Joining, by = "ID"

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/gnoto/LFPP12_bacteroides.pdf", height=2.5, width=2.3, useDingbats=F)
Phenotypes
metabolic1<-read_excel("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_LFPP1_collated_metabolic.xlsx") %>% mutate(Expt="LFPP1")
metabolic2<-read_excel("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_LFPP2_collated_metabolic.xlsx") %>% mutate(Expt="LFPP2")
metabolic<-bind_rows(metabolic1,metabolic2)
metabolic %>%
mutate(Ethnicity=gsub("Chinese","EA",Ethnicity)) %>%
mutate(Ethnicity=gsub("White","W",Ethnicity)) %>%
pivot_longer(cols=BodyWt_End:Lean_Pct_Change, names_to = "metric") %>%
filter(metric %in% c("BodyWt_Change","Fat_Pct_Change","Lean_Pct_Change")) %>%
ggplot(aes(x=Ethnicity, y=value, fill=Ethnicity)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(shape=21, size=2, height = 0, width=0.1) +
facet_wrap(~metric, scales="free") +
theme_boxplot() +
scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
stat_compare_means(comparisons = statgroups, method="wilcox") +
ylab("% change")
## Warning in wilcox.test.default(c(0.900000000000002, 2.2, 1.5, 2, 2, 1.8, :
## cannot compute exact p-value with ties

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/gnoto/LFPP12_metabolic.pdf", height=2.5, width=5.5, useDingbats=F)